# Import Dataset

# Load the raw HR employee records from CSV into `data`.
# NOTE(review): this is an absolute, machine-specific path. The first column
# shows up below with stray leading characters in its name, which suggests
# the file starts with a UTF-8 byte-order mark -- confirm before changing how
# the file is read, because later code refers to that column by name.
data <- read.csv(
  file = "D:/WinSem_21-22/EDA/PROJECT/Employee_Attrition_Prediction_and_Analysis-main/HR_Employee_Data.csv"
)

# Summary of the Data

# Preview the first rows, then list every column with its storage type.
utils::head(data)
utils::str(data)
## 'data.frame':    14999 obs. of  11 variables:
##  $ ï..Emp_Id            : chr  "IND02438" "IND28133" "IND07164" "IND30478" ...
##  $ satisfaction_level   : chr  "38%" "80%" "11%" "72%" ...
##  $ last_evaluation      : chr  "53%" "86%" "88%" "87%" ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Department           : chr  "sales" "sales" "sales" "sales" ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...
library(dplyr) 
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Compact one-line-per-column overview of the raw table.
dplyr::glimpse(data)
## Rows: 14,999
## Columns: 11
## $ ï..Emp_Id             <chr> "IND02438", "IND28133", "IND07164", "IND30478", ~
## $ satisfaction_level    <chr> "38%", "80%", "11%", "72%", "37%", "41%", "10%",~
## $ last_evaluation       <chr> "53%", "86%", "88%", "87%", "52%", "50%", "77%",~
## $ number_project        <int> 2, 5, 7, 5, 2, 2, 6, 5, 5, 2, 2, 6, 4, 2, 2, 2, ~
## $ average_montly_hours  <int> 157, 262, 272, 223, 159, 153, 247, 259, 224, 142~
## $ time_spend_company    <int> 3, 6, 4, 5, 3, 3, 4, 5, 5, 3, 3, 4, 5, 3, 3, 3, ~
## $ Work_accident         <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ left                  <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ promotion_last_5years <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ Department            <chr> "sales", "sales", "sales", "sales", "sales", "sa~
## $ salary                <chr> "low", "medium", "medium", "low", "low", "low", ~
# Per-column summary: min/quartiles/mean/max for the numeric columns and
# length/class/mode for the character columns.
summary(data)
##   ï..Emp_Id         satisfaction_level last_evaluation    number_project 
##  Length:14999       Length:14999       Length:14999       Min.   :2.000  
##  Class :character   Class :character   Class :character   1st Qu.:3.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :4.000  
##                                                           Mean   :3.803  
##                                                           3rd Qu.:5.000  
##                                                           Max.   :7.000  
##  average_montly_hours time_spend_company Work_accident         left       
##  Min.   : 96.0        Min.   : 2.000     Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:156.0        1st Qu.: 3.000     1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :200.0        Median : 3.000     Median :0.0000   Median :0.0000  
##  Mean   :201.1        Mean   : 3.498     Mean   :0.1446   Mean   :0.2381  
##  3rd Qu.:245.0        3rd Qu.: 4.000     3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :310.0        Max.   :10.000     Max.   :1.0000   Max.   :1.0000  
##  promotion_last_5years  Department           salary         
##  Min.   :0.00000       Length:14999       Length:14999      
##  1st Qu.:0.00000       Class :character   Class :character  
##  Median :0.00000       Mode  :character   Mode  :character  
##  Mean   :0.02127                                            
##  3rd Qu.:0.00000                                            
##  Max.   :1.00000

# Check for Missing (NA) Values

# Missing-value count for each column, displayed as a one-column table.
cbind(lapply(data, function(column) sum(is.na(column))))
##                       [,1]
## ï..Emp_Id             0   
## satisfaction_level    0   
## last_evaluation       0   
## number_project        0   
## average_montly_hours  0   
## time_spend_company    0   
## Work_accident         0   
## left                  0   
## promotion_last_5years 0   
## Department            0   
## salary                0
# Total number of missing cells across the whole table.
sum(is.na(data))
## [1] 0

# Data Cleaning

# The two score columns arrive as text such as "38%". Remove the percent sign
# and store the remaining digits as integers, previewing the table after each
# conversion.
data$satisfaction_level <- as.integer(
  gsub("%", "", as.character(data$satisfaction_level))
)
head(data)

data$last_evaluation <- as.integer(
  gsub("%", "", as.character(data$last_evaluation))
)
head(data)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## v readr   2.0.1
## Warning: package 'ggplot2' was built under R version 4.1.1
## Warning: package 'readr' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 4.1.1
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(dplyr)
library(plotly)
library(hrbrthemes)
## Warning: package 'hrbrthemes' was built under R version 4.1.1
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(highcharter)
## Warning: package 'highcharter' was built under R version 4.1.1
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.1
library(caret)
## Warning: package 'caret' was built under R version 4.1.1
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(kernlab)
## Warning: package 'kernlab' was built under R version 4.1.3
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(caTools)
## Warning: package 'caTools' was built under R version 4.1.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.1.3
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Keep the cleaned table under `df`; the name `data` is reused for small
# summary tables further down.
df <- data
utils::head(df)
utils::str(df)
## 'data.frame':    14999 obs. of  11 variables:
##  $ ï..Emp_Id            : chr  "IND02438" "IND28133" "IND07164" "IND30478" ...
##  $ satisfaction_level   : int  38 80 11 72 37 41 10 92 89 42 ...
##  $ last_evaluation      : int  53 86 88 87 52 50 77 85 100 53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Department           : chr  "sales" "sales" "sales" "sales" ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...
# Mean monthly working hours for each department.
data <- df %>%
  group_by(Department) %>%
  summarise(Avg_hrs = mean(average_montly_hours))

# Bar chart of the departmental averages, built and labelled in one chain.
fig <- plot_ly(
  data,
  x = ~Department,
  y = ~Avg_hrs,
  type = "bar",
  color = I("dark blue")
) %>%
  layout(
    title = "Average monthly working hours according to department",
    xaxis = list(title = "Department"),
    yaxis = list(title = "Average monthly working hours")
  )

fig
#ggplot(df,aes(x=number_project,y=average_montly_hours))+geom_jitter(aes(color=Department))

# Count the employees with a recorded work accident in each department,
# ordered from the fewest to the most accidents.
data <- df %>%
  filter(Work_accident == 1) %>%
  group_by(Department) %>%
  summarize(No_of_wa = n()) %>%
  arrange(No_of_wa)
head(data)

# Line chart of the per-department accident counts.
# Highcharts axis titles are option objects, so the label has to be passed as
# `title = list(text = ...)`; a bare string is not picked up as the title text.
hc <- data %>%
  hchart("line", hcaes(x = Department, y = No_of_wa)) %>%
  hc_title(text = "Number of work accidents for each department") %>%
  hc_yAxis(title = list(text = "Number of work accidents"))
hc
# Split the employees by salary band so each band gets its own density curve.
l <- df %>% filter(salary == "low")
m <- df %>% filter(salary == "medium")
h <- df %>% filter(salary == "high")

# Overlaid area-density curves of satisfaction level, one series per band.
# The x-axis label is passed as `title = list(text = ...)` because Highcharts
# axis titles are option objects; a bare string is not used as the title text.
hc2 <- hchart(
  density(l$satisfaction_level),
  type = "area",
  color = "steelblue",
  name = "Low Salary"
) %>%
  hc_add_series(
    density(m$satisfaction_level),
    type = "area",
    color = "#B71C1C",
    name = "Medium Salary"
  ) %>%
  hc_add_series(
    density(h$satisfaction_level),
    type = "area",
    color = "yellow",
    name = "High Salary"
  ) %>%
  hc_title(text = "Density plot of satisfaction level according to salary") %>%
  hc_xAxis(title = list(text = "Satisfaction Level (0-100)"))
hc2
# Pie chart with one slice per department, sized by the `time_spend_company`
# values of its employees. Grid lines, zero lines and tick labels are switched
# off on both axes.
fig <- plot_ly(
  df,
  labels = ~Department,
  values = ~time_spend_company,
  type = "pie"
) %>%
  layout(
    title = "Time spent per Department",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )

fig
# Modelling copy of the cleaned table: the 0/1 `left` flag becomes a
# two-level factor so the classifiers treat it as a class label.
df2 <- df
df2[["left"]] <- as.factor(df2[["left"]])
str(df2)
## 'data.frame':    14999 obs. of  11 variables:
##  $ ï..Emp_Id            : chr  "IND02438" "IND28133" "IND07164" "IND30478" ...
##  $ satisfaction_level   : int  38 80 11 72 37 41 10 92 89 42 ...
##  $ last_evaluation      : int  53 86 88 87 52 50 77 85 100 53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Department           : chr  "sales" "sales" "sales" "sales" ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...
# Finish the modelling frame and split it into training and test rows.

# Drop the employee identifier, which is not a predictor. The column is
# matched by its suffix so it is removed whether or not stray byte-order-mark
# characters ended up in front of "Emp_Id" when the CSV was read.
df2 <- df2[, !grepl("Emp_Id$", names(df2)), drop = FALSE]

# Turn the remaining text columns (Department, salary) into factors.
# randomForest() treats non-factor columns as numeric, so text columns would
# otherwise be reduced to arbitrary integer codes.
df2[] <- lapply(df2, function(column) {
  if (is.character(column)) factor(column) else column
})

# sample.split() expects the vector of class labels and returns one TRUE/FALSE
# per row. Passing the whole data frame (as before) produced a mask with one
# entry per *column* -- the length-10 vector previously printed here -- which
# was then recycled down the rows. Seed first so the split is reproducible.
# Outputs recorded further down came from the earlier split and will differ
# when the script is re-run.
set.seed(120)
split <- sample.split(df2$left, SplitRatio = 0.7)
table(split)

# `split` is already logical, so it can index the rows directly.
train <- df2[split, , drop = FALSE]
test <- df2[!split, , drop = FALSE]

# Linear-kernel SVM classifier for `left`, trained on all other columns of
# `train`.
classifier <- svm(
  formula = left ~ .,
  data = train,
  type = "C-classification",
  kernel = "linear"
)

# Predict on both sets. The response is excluded by name rather than by the
# position 7, so a change in column layout cannot silently drop a predictor
# and leave `left` in the prediction input.
y_pred <- predict(classifier, newdata = test[names(test) != "left"])
y_train_pred <- predict(classifier, newdata = train[names(train) != "left"])

# Confusion matrix on the test rows: actual `left` (rows) vs. prediction
# (columns).
cm <- table(test$left, y_pred)
cm
##    y_pred
##        0    1
##   0 3225  203
##   1  798  273
# Confusion matrix on the training rows: actual class in column 7 (rows)
# vs. the SVM's training-set prediction (columns).
cm2 <- table(train[[7]], y_train_pred)
cm2
##    y_train_pred
##        0    1
##   0 7520  480
##   1 1859  641
# Fit a 50-tree random forest on the training rows (`train` and `test` are
# created earlier in the script; no splitting happens here).
set.seed(120)  # fixes the forest's randomness for a given training set

# Build the predictor set by excluding `left` by name. With a positional
# index, any shift in column layout would leave the response inside `x` and
# let the forest learn the label directly.
classifier_RF <- randomForest(
  x = train[names(train) != "left"],
  y = train$left,
  ntree = 50
)

classifier_RF
## 
## Call:
##  randomForest(x = train[-7], y = train$left, ntree = 50) 
##                Type of random forest: classification
##                      Number of trees: 50
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 0.88%
## Confusion matrix:
##      0    1 class.error
## 0 7982   18     0.00225
## 1   74 2426     0.02960
# Predict the test rows with the random forest. `left` is excluded by name so
# the prediction input always matches the predictors, whatever the column
# order.
y_pred <- predict(classifier_RF, newdata = test[names(test) != "left"])

# Confusion matrix: actual `left` (rows) vs. forest prediction (columns).
confusion_mtx <- table(test$left, y_pred)
confusion_mtx
##    y_pred
##        0    1
##   0 3417   11
##   1   78  993
# Draw the default plot for the fitted forest.
plot(classifier_RF)

# Table of per-variable importance scores (MeanDecreaseGini).
randomForest::importance(classifier_RF)
##                       MeanDecreaseGini
## satisfaction_level         1302.277501
## last_evaluation             447.775675
## number_project              708.454623
## average_montly_hours        530.746767
## time_spend_company          697.367202
## Work_accident                21.846892
## promotion_last_5years         3.833769
## Department                   42.274183
## salary                       30.887432
# Chart of the variable importance scores for the fitted forest.
randomForest::varImpPlot(classifier_RF)